{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "iImkWEpRSiRq"
      },
      "outputs": [],
      "source": [
        "\n",
        "# Load libraries\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "from sklearn.datasets import load_iris, make_regression\n",
        "from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectPercentile, VarianceThreshold, RFECV\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "import warnings\n",
        "from sklearn import datasets, linear_model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZEK7KAyzSokS",
        "outputId": "7ce72382-c116-4f51-df7b-1f975c1c25f8"
      },
      "outputs": [],
      "source": [
        "# Variance thresholding on the raw iris measurements.\n",
        "iris = datasets.load_iris()\n",
        "features_i, target_i = iris.data, iris.target\n",
        "\n",
        "# Remove low-variance columns, using 0.4 as the variance threshold.\n",
        "thresholder = VarianceThreshold(threshold=0.4)\n",
        "f_high_variance = thresholder.fit_transform(features_i)\n",
        "\n",
        "# Preview the first three rows of the reduced feature matrix.\n",
        "f_high_variance[:3]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7ZZgOg1-SpuX",
        "outputId": "a869adde-0b29-4630-9661-34377f110d4f"
      },
      "outputs": [],
      "source": [
        "# Refit the thresholder on the iris features, then show the variance\n",
        "# it computed for each column.\n",
        "thresholder.fit(features_i)\n",
        "thresholder.variances_"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zYNK4wP5Sq9R",
        "outputId": "30e18ea5-4b63-43e5-819e-9a99251dfae6"
      },
      "outputs": [],
      "source": [
        "# Standardize the feature matrix column by column.\n",
        "scaler = StandardScaler()\n",
        "f_std = scaler.fit_transform(features_i)\n",
        "\n",
        "# Compute the variance of each standardized feature; once the columns\n",
        "# share a common scale, variance thresholding has little to work with.\n",
        "selection = VarianceThreshold().fit(f_std)\n",
        "selection.variances_"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jDGMP97LSuiB",
        "outputId": "c1b9d537-495f-4109-ef75-324fe9943668"
      },
      "outputs": [],
      "source": [
        "# Binary feature matrix where:\n",
        "#   feature 0 is 80% class 0,\n",
        "#   feature 1 is 80% class 1,\n",
        "#   feature 2 is 60% class 0 and 40% class 1.\n",
        "# Every entry must be 0 or 1 for the Bernoulli variance formula below to\n",
        "# apply (the first row previously held a stray 2 in feature 1).\n",
        "# The matrix gets its own name so the iris `features_i` that the earlier\n",
        "# cells fit on is not overwritten when this cell runs.\n",
        "features_b = [[0, 1, 0],\n",
        "              [0, 1, 1],\n",
        "              [0, 1, 0],\n",
        "              [0, 1, 1],\n",
        "              [1, 0, 0]]\n",
        "# A Bernoulli feature has variance p * (1 - p). Use the p = 0.65 level\n",
        "# (0.2275) as the cut-off: features 0 and 1 have variance 0.16 and are\n",
        "# removed, feature 2 has variance 0.24 and is kept.\n",
        "thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))\n",
        "thresholding.fit_transform(features_b)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 198
        },
        "id": "JvnObeKXS6xm",
        "outputId": "19dac143-9407-4bb4-cc23-b19b06025617"
      },
      "outputs": [],
      "source": [
        "# Feature matrix in which columns 0 and 1 are highly correlated.\n",
        "features_m = np.array([[1, 1, 1],\n",
        "                       [2, 2, 0],\n",
        "                       [3, 3, 1],\n",
        "                       [4, 4, 0],\n",
        "                       [5, 5, 1],\n",
        "                       [6, 6, 0],\n",
        "                       [7, 7, 1],\n",
        "                       [8, 7, 0],\n",
        "                       [9, 7, 1]])\n",
        "# Wrap the matrix in a DataFrame to use pandas' correlation tools.\n",
        "dataframe = pd.DataFrame(features_m)\n",
        "# Absolute pairwise correlations between the columns.\n",
        "corr_m = dataframe.corr().abs()\n",
        "# Keep only the strict upper triangle so each column pair is inspected once.\n",
        "# The builtin `bool` is used because the `np.bool` alias was removed in\n",
        "# NumPy 1.24 and raises AttributeError there.\n",
        "upper1 = corr_m.where(np.triu(np.ones(corr_m.shape, dtype=bool), k=1))\n",
        "# Labels of the columns whose correlation with an earlier column exceeds 0.85.\n",
        "droping = [col for col in upper1.columns if (upper1[col] > 0.85).any()]\n",
        "# Drop those columns by label (not by position) and preview the result.\n",
        "dataframe.drop(columns=droping).head(3)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Dos1ZfkDS-Zd",
        "outputId": "17e96f0d-a55a-4943-90a9-99aa3c31fad3"
      },
      "outputs": [],
      "source": [
        "# Load the iris data and read features/target from the bunch loaded here,\n",
        "# instead of depending on an `iris` variable left behind by another cell.\n",
        "iris_i = load_iris()\n",
        "features_v = iris_i.data\n",
        "target = iris_i.target\n",
        "# Truncate the measurements to integers so they can be treated as\n",
        "# categorical counts by the chi-squared test.\n",
        "features_v = features_v.astype(int)\n",
        "# Keep the two features with the highest chi-squared statistics.\n",
        "chi2_s = SelectKBest(chi2, k=2)\n",
        "f_kbest = chi2_s.fit_transform(features_v, target)\n",
        "# Show results\n",
        "print(\"Original number of features:\", features_v.shape[1])\n",
        "print(\"Reduced number of features:\", f_kbest.shape[1])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "y10u_gQbTCwR",
        "outputId": "651182ab-d857-4a3d-db61-4fff866d167c"
      },
      "outputs": [],
      "source": [
        "# Score every feature with the ANOVA F-test against the target and keep\n",
        "# the two highest-scoring ones.\n",
        "f_selector = SelectKBest(score_func=f_classif, k=2)\n",
        "f_kbest = f_selector.fit(features_v, target).transform(features_v)\n",
        "# Display results\n",
        "print(\"Original number of features:\", features_v.shape[1])\n",
        "print(\"Reduced number of features:\", f_kbest.shape[1])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5NXAa6UKTHiu",
        "outputId": "c34866b2-c08c-4020-b14d-78deb98f2834"
      },
      "outputs": [],
      "source": [
        "# Same F-test scoring, but keep the top 65% of features instead of a\n",
        "# fixed count.\n",
        "f_selector = SelectPercentile(score_func=f_classif, percentile=65)\n",
        "f_kbest = f_selector.fit(features_v, target).transform(features_v)\n",
        "# Display results\n",
        "print(\"Original number of features:\", features_v.shape[1])\n",
        "print(\"Reduced number of features:\", f_kbest.shape[1])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "39-Wq-F9TKVg",
        "outputId": "e52c0537-2245-4f12-ea9a-ace232984ec1"
      },
      "outputs": [],
      "source": [
        "# Silence scipy's harmless 'internal gelsd' warning.\n",
        "warnings.filterwarnings(\n",
        "    action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\"\n",
        ")\n",
        "\n",
        "# Synthetic regression problem: 10,000 samples and 100 features, of which\n",
        "# only 2 are informative. Returns the feature matrix and target vector.\n",
        "features_f, target_t = make_regression(\n",
        "    n_samples=10000, n_features=100, n_informative=2, random_state=1\n",
        ")\n",
        "\n",
        "# Recursive feature elimination with cross-validation around a linear\n",
        "# regression: remove two features per step and score each candidate\n",
        "# subset by negative mean squared error.\n",
        "ols = linear_model.LinearRegression()\n",
        "rfecv = RFECV(estimator=ols, step=2, scoring=\"neg_mean_squared_error\")\n",
        "rfecv.fit(features_f, target_t)\n",
        "rfecv.transform(features_f)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Ut1mgIGEUhJM",
        "outputId": "f365a4d5-63f4-4a55-e828-d331e6f06308"
      },
      "outputs": [],
      "source": [
        "# Number of features the fitted RFECV selected\n",
        "rfecv.n_features_"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Lpt7I_Q0UjN1",
        "outputId": "4d6938dc-d813-42a5-c1b7-9ba4865a0e86"
      },
      "outputs": [],
      "source": [
        "# Boolean mask over the input features: True marks a selected feature\n",
        "rfecv.support_"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ojYKsEbTUkMu",
        "outputId": "98652d92-f58f-41fe-9ba1-b1ecd3ef7ecb"
      },
      "outputs": [],
      "source": [
        "# Rank of every feature; the selected features are assigned rank 1\n",
        "rfecv.ranking_"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "Untitled42.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
